import asyncio # 异步
import looter as lt # pip install looter
import random
from parsel import Selector
import json
import requests
import time
import pymysql
from DBUtils.PersistentDB import PersistentDB
# Base URL of the target site; all listing/API URLs below are built from it.
domain = 'https://www.cnblogs.com/'

# Request headers sent with every HTTP call. The user-agent/referer make the
# AJAX endpoints respond like a browser; the cookie was captured from a real
# session -- NOTE(review): this cookie is almost certainly stale, confirm it
# is still required by the endpoints before relying on it.
headers={
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36 Qiyu/2.1.1.1',
    'referer':'https://www.cnblogs.com/',
    'cookie':'__gads=ID=9438c1cd86c17866:T=1543647755:S=ALNI_Mbi3TShe973wtyvo52ld2uDpxPKaA; _ga=GA1.2.1162600031.1543647753; _gid=GA1.2.1552287033.1557489053'
}

# parsel provides a Selector class, used below to parse HTML fragments returned by AJAX endpoints.
async def crawl(url):
    """Fetch one site-home listing page and return the article links on it.

    :param url: listing-page URL, e.g. ``{domain}sitehome/p/1``
    :return: list of article URLs extracted from the title anchors
    """
    page = await lt.async_fetch(url)
    return page.xpath('//a[@class="titlelnk"]/@href').extract()

async def crawl_info(urls):
    """Scrape each article page, persist it to MySQL, and collect author home pages.

    For every article URL this extracts the title, publish date, author,
    categories, tags, view count, comment count and body HTML, inserts one row
    into the ``cnblogs`` table, and remembers the author's blog home link.

    :param urls: iterable of article URLs (``.../<blogApp>/p/<postId>.html``)
    :return: set of author blog home-page URLs discovered on the pages
    :raises: propagates network/parse/DB errors; the pooled connection and
             cursor are always released via ``finally``.
    """
    conn = pool1.connection()
    cursor = conn.cursor()
    author_word_list = set()
    try:
        for url in urls:
            result = {}
            tree_info = lt.fetch(url)
            result['title'] = tree_info.xpath('//a[@id="cb_post_title_url"]/text()').extract_first()
            # Post meta line (date + author); the XPath union covers the
            # different blog-theme layouts cnblogs uses.
            info = tree_info.xpath('string(//div/small|//div[@class="postDesc"]|//*[@class="postfoot"])').extract_first()
            info_list = info.strip().split()
            # Two observed meta formats: "posted @ <date> <time> by <author>"
            # style vs. a layout where date/author sit at fixed offsets.
            if 'by' in info_list:
                result['fbrq'] = info_list[0]
                result['author'] = info_list[3]
            else:
                result['fbrq'] = info_list[2]
                result['author'] = info_list[4]
            # The remaining fields come from dynamic AJAX endpoints (requests).
            # blogId/blogApp must be scraped out of an inline <script>.
            script_text = tree_info.xpath('//script[2]/text()').extract_first()
            blogId = script_text.split(';')[0].split('=')[-1]
            blogApp = script_text.split(';')[1].split(',')[0].split('=')[-1].strip("'")
            # postId is the last path segment of the article URL, e.g.
            # https://www.cnblogs.com/shine-lee/p/10717521.html -> 10717521
            postId = url.split('/')[-1].split('.')[0]
            _ = int(time.time() * 1000)  # cache-buster timestamp, e.g. 1556110169047
            # Randomized delay to avoid hammering the site / getting banned.
            time.sleep(random.randint(1, 10))
            res = json.loads(requests.get(f'https://www.cnblogs.com/mvc/blog/CategoriesTags.aspx?blogApp={blogApp}&blogId={blogId}&postId={postId}&_={_}',headers=headers).text)
            # Categories/Tags arrive as HTML fragments; parse the anchor texts.
            result['category'] = ','.join(Selector(res['Categories']).xpath('//a/text()').extract())
            result['labels'] = ','.join(Selector(res['Tags']).xpath('//a/text()').extract())
            result['read_num'] = requests.get(f'https://www.cnblogs.com/mvc/blog/ViewCountCommentCout.aspx?postId={postId}',headers=headers
                                              ).text.strip()
            result['reply_num'] = json.loads(requests.get(
                f'https://www.cnblogs.com/mvc/blog/GetComments.aspx?postId={postId}&blogApp={blogApp}&pageIndex=0&anchorCommentId=0&_={_}',headers=headers).text)[
                'commentCount']
            result['contents'] = tree_info.xpath('//*[@id="cnblogs_post_body"]').extract_first()
            # Parameterized insert; first column (NULL) is the auto-increment id.
            sql = 'insert into cnblogs values(NULL,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            cursor.execute(sql, (result['title'], result['contents'], result['author'], result['category'], result['labels'], result['fbrq'], result['read_num'], result['reply_num'], url))
            conn.commit()
            author_word_list.add(tree_info.xpath('//*[@id="Header1_HeaderTitle"]/@href').extract_first())
    finally:
        # Always release the pooled connection, even if a page fails mid-loop
        # (the original leaked cursor/conn on any exception).
        cursor.close()
        conn.close()
    return author_word_list

async def crawl_author_word(urls):
    """Walk each author's post index and collect all of their article URLs.

    Starting from each author home page, follows the "next page" link until
    it disappears, accumulating every post link along the way.

    :param urls: iterable of author blog home-page URLs
    :return: flat list of article URLs across all authors and pages
    """
    word_list = []
    for start_url in urls:
        page_url = start_url
        # Paginate: extract_first() returns None when there is no next page,
        # which ends the loop.
        while page_url:
            page = lt.fetch(page_url)
            word_list.extend(page.xpath('//div[@class="postTitle"]/a/@href').extract())
            page_url = page.xpath('//*[@id="nav_next_page"]/@href').extract_first()
    return word_list

if __name__ == '__main__':
    # Persistent MySQL connection pool; crawl_info reads it as a module global.
    pool1 = PersistentDB(creator=pymysql, maxusage=None, ping=0, closeable=False, host='127.0.0.1', user='root',
                         password='root', db='blog_spider', charset='utf8')
    # 200 site-home listing pages to seed the crawl.
    tasklist = [f'{domain}sitehome/p/{i}' for i in range(1, 201)]
    loop = asyncio.get_event_loop()
    # asyncio.gather replaces asyncio.wait(list-of-coroutines): passing bare
    # coroutines to wait() is deprecated since Python 3.8 and raises on 3.11+,
    # and gather returns results directly, in input order (wait() returned an
    # unordered set of tasks).
    # Stage 1: article links from every listing page.
    link_lists = loop.run_until_complete(asyncio.gather(*[crawl(url) for url in tasklist]))
    # Stage 2: scrape + store each article; collect author home pages.
    author_sets = loop.run_until_complete(asyncio.gather(*[crawl_info(links) for links in link_lists]))
    # Stage 3: every author's other article URLs.
    author_articles = loop.run_until_complete(asyncio.gather(*[crawl_author_word(pages) for pages in author_sets]))
    # Stage 4: scrape + store those articles as well.
    loop.run_until_complete(asyncio.gather(*[crawl_info(urls) for urls in author_articles]))
